require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Hmisc::getHdata(nhgh)
d1 <- nhgh %>%
filter(sex == "female") %>%
filter(age >= 18) %>%
select(gh, ht) %>%
filter(1:n()<=1000)
We want to figure out which kind of distribution fits the data well. In this blog, we choose 3 kinds of distribution, normal, gamma and weibull distribution. These three distribution are the most common distribution.
We use MM and MLE to calculate the parameters for each distribution and then make some plots to find out which distribution fits the data.
Method of Moments:the method of moments is a method of estimation of population parameters.(From wikipedia)
1a. Choose a parametric distribution, \(F_X(x|\theta)\)
1b. Identify the distribution parameters, \(\theta\)
Calculate/find distribution moments (need as many moments as distribution parameters), \(E[X^k]\) or \(E[(X - E[X])^k]\)
Calculate sample moments.
Create system of equations equating distribution moments to sample moments.
Solve system of equations for distribution parameters.
\(\hat{F}_X = F_X(x | \theta = \hat{\theta})\)
In statistics, maximum likelihood estimation (MLE) is a method of estimating the parameters of a probability distribution by maximizing a likelihood function, so that under the assumed statistical model the observed data is most probable. The point in the parameter space that maximizes the likelihood function is called the maximum likelihood estimate.(From wikipedia)
mm.gamma.shape = mean(d1$ht)^2/var(d1$ht)
mm.gamma.scale=var(d1$ht)/mean(d1$h)
mm.norm.mean=mean(d1$ht)
mm.norm.sd=sd(d1$ht)
mean.weib = function(lambda, k){
lambda*gamma(1 + 1/k)
}
lambda = function(samp.mean, k){
samp.mean/gamma(1+1/k)
}
var.weib = function(samp.mean, k,samp.var){
(lambda(samp.mean, k))^2*(gamma(1+2/k)-(gamma(1+1/k))^2)-samp.var
}
mm.opt = optimize(f = function(x){abs(var.weib(k = x, samp.mean = mean(d1$ht), samp.var = var(d1$ht)))},lower = 10, upper = 100)
mm.weib.k = mm.opt$minimum
mm.weib.lambda= lambda(samp.mean = mean(d1$ht), k=mm.weib.k)
hist(d1$ht, main = "Height of Adult Females: MM", breaks = 100, freq = FALSE)
curve(dgamma(x, shape = mm.gamma.shape, scale = mm.gamma.scale), add = TRUE, col = "red")
curve(dnorm(x, mm.norm.mean, mm.norm.sd), add = TRUE, col = "blue", lwd = 2)
curve(dweibull(x, shape = mm.weib.k, scale = mm.weib.lambda),add = TRUE, col = "green",lwd = 2)
plot(ecdf(d1$ht),main = "The CDF and ECDF of ht: MM")
curve(pgamma(x, shape = mm.gamma.shape, scale = mm.gamma.scale), add = TRUE, col = "red",lwd = 6)
curve(pnorm(x, mm.norm.mean, mm.norm.sd), add = TRUE, col = "blue", lwd = 4)
curve(pweibull(x, shape = mm.weib.k, scale = mm.weib.lambda),add = TRUE, col = "green",lwd = 4)
p = ppoints(300)
y = quantile(d1$ht, probs = p)
#gamma
x.gamma.mm = qgamma(p, shape = mm.gamma.shape, scale = mm.gamma.scale)
plot(x.gamma.mm,y,main = "Gamma QQplot")
abline(0,1)
#norm
x.norm.mm = qnorm(p, mm.norm.mean, mm.norm.sd)
plot(x.norm.mm,y,main = "Norm QQplot")
abline(0,1)
#weibull
x.pwei.mm=qweibull(p, shape = mm.weib.k, scale = mm.weib.lambda)
plot(x.pwei.mm,y,main = "Weibull QQplot")
abline(0,1)
median(d1$ht)
## [1] 160.8
qweibull(0.5, shape = mm.weib.k, scale = mm.weib.lambda)
## [1] 161.8065
qgamma(0.5, shape = mm.gamma.shape, scale = mm.gamma.scale)
## [1] 160.6308
qnorm(0.5, mm.norm.mean, mm.norm.sd)
## [1] 160.7419
#gamma
med.gam = NA
for (i in 1:1000) {
data = rgamma(200, shape = mm.gamma.shape, scale = mm.gamma.scale)
med.gam[i] = median(data)
}
hist(med.gam,breaks = 100,main = "Histogram of median of gamma distribution")
#norm
med.norm = NA
for (i in 1:1000) {
data = rnorm(200, mm.norm.mean, mm.norm.sd)
med.norm[i] = median(data)
}
hist(med.norm,breaks = 100,main = "Histogram of median of normal distribution")
#
med.wei = NA
for (i in 1:1000) {
data = rweibull(200, shape = mm.weib.k, scale = mm.weib.lambda)
med.wei[i] = median(data)
}
hist(med.wei,breaks = 100,main = "Histogram of median of weibull distribution")
diff(quantile(med.gam,probs = c(0.025,0.975)))
## 97.5%
## 2.415832
diff(quantile(med.norm,probs = c(0.025,0.975)))
## 97.5%
## 2.602738
diff(quantile(med.wei,probs = c(0.025,0.975)))
## 97.5%
## 2.396322
library(stats4)
nLL <- function(mean, sd){
fs <- dnorm(
x = d1$ht
,mean = mean
,sd = sd
,log = TRUE
)
-sum(fs)
}
fit <- mle(
nLL
,start = list(mean = 160, sd = 5)
,method = "L-BFGS-B"
,lower = c(0,0.01)
)
nLL.gamma <- function(shape, scale){
fs <- dgamma(
x = d1$ht
,shape = shape
,scale = scale
,log = TRUE
)
-sum(fs)
}
fit.gamma <- mle(
nLL.gamma
,start = list(shape = 1, scale = 1)
,method = "L-BFGS-B"
,lower = c(0,0.01)
)
nLL.weibull <- function(shape, scale){
fs <- dweibull(
x = d1$ht
,shape = shape
,scale = scale
,log = TRUE
)
-sum(fs)
}
fit.weibull <- mle(
nLL.weibull
,start = list(shape = 1, scale = 1)
,method = "L-BFGS-B"
,lower = c(0,0.01)
)
hist(d1$ht,breaks = 100, freq = FALSE,,main = "Overlay estimated pdf onto histogram of Normal distribution")
curve(dnorm(x,mean = coef(fit)[1],sd =coef(fit)[2]),col = "red" ,add = TRUE, lwd = 4)
hist(d1$ht,breaks = 100, freq = FALSE,,main = "Overlay estimated pdf onto histogram of Gamma distribution")
curve(dgamma(x,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2]),col = "blue" ,add = TRUE, lwd = 4)
hist(d1$ht,breaks = 100, freq = FALSE,main = "Overlay estimated pdf onto histogram of Weibull distribution")
curve(dweibull(x,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2]),col = "green" ,add = TRUE, lwd = 4)
plot(ecdf(d1$ht),,main = "Overlay estimated CDF onto eCDF of normal distribution")
curve(pnorm(x,mean = coef(fit)[1],sd =coef(fit)[2]),col = "red" ,add = TRUE, lwd = 4)
plot(ecdf(d1$ht),,main = "Overlay estimated CDF onto eCDF of gamma distribution")
curve(pgamma(x,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2]),col = "blue" ,add = TRUE, lwd = 4)
plot(ecdf(d1$ht),,main = "Overlay estimated CDF onto eCDF of weibull distribution")
curve(pweibull(x,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2]),col = "green" ,add = TRUE, lwd = 4)
qs <- seq(0.05,0.95,length = 100)
samp_qs <- quantile(d1$ht,qs)
theor_qs <- qnorm(qs,mean = coef(fit)[1],sd =coef(fit)[2])
plot(samp_qs,theor_qs)
abline(0,1)
qs <- seq(0.05,0.95,length = 100)
samp_qs.gamma <- quantile(d1$ht,qs)
theor_qs.gamma <- qgamma(qs,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2])
plot(samp_qs.gamma,theor_qs.gamma)
abline(0,1)
qs <- seq(0.05,0.95,length = 100)
samp_qs.w <- quantile(d1$ht,qs)
theor_qs.w <- qweibull(qs,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2])
plot(samp_qs.w,theor_qs.w)
abline(0,1)
qnorm(0.5,mean = coef(fit)[1],sd = coef(fit)[2])
## [1] 160.7419
qgamma(0.5,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2])
## [1] 160.6312
qweibull(0.5,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2])
## [1] 161.5156
M <- 500
N <- 1000
out <- rnorm(N*M,mean = coef(fit)[1],sd =coef(fit)[2]) %>% array(dim = c(M,N))
samp_dist <- apply(out,1,median)
hist(samp_dist,breaks = 100,,main = "Median Samp Dist (hist) of normal distribution")
M <- 500
N <- 1000
out <- rgamma(N*M,shape = coef(fit.gamma)[1],scale =coef(fit.gamma)[2]) %>% array(dim = c(M,N))
samp_dist.g <- apply(out,1,median)
hist(samp_dist.g,breaks = 100,main = "Median Samp Dist (hist) of gamma distribution")
M <- 500
N <- 1000
out <- rweibull(N*M,shape = coef(fit.weibull)[1],scale =coef(fit.weibull)[2]) %>% array(dim = c(M,N))
samp_dist.w <- apply(out,1,median)
hist(samp_dist.w,breaks = 100,main = "Median Samp Dist (hist) of weibull distribution")
### Range of middle 95% of Samp Dist
diff(quantile(samp_dist,c(0.05/2,1-0.05/2)))
## 97.5%
## 1.070803
diff(quantile(samp_dist.g,c(0.05/2,1-0.05/2)))
## 97.5%
## 1.114164
diff(quantile(samp_dist.w,c(0.05/2,1-0.05/2)))
## 97.5%
## 1.257542
#gamma
mm.gamma.shape.g = mean(d1$gh)^2/var(d1$gh)
mm.gamma.scale.g=var(d1$gh)/mean(d1$gh)
#normal
mm.norm.mean.g=mean(d1$gh)
mm.norm.sd.g=sd(d1$gh)
#weibull
mean.weib.g = function(lambda, k){
lambda*gamma(1 + 1/k)
}
lambda.g = function(samp.mean, k){
samp.mean/gamma(1+1/k)
}
var.weib.g = function(samp.mean, k,samp.var){
(lambda(samp.mean, k))^2*(gamma(1+2/k)-(gamma(1+1/k))^2)-samp.var
}
mm.opt.g = optimize(f = function(x){abs(var.weib(k = x, samp.mean = mean(d1$gh), samp.var = var(d1$gh)))},lower = 10, upper = 100)
mm.weib.k.g = mm.opt.g$minimum
mm.weib.lambda.g= lambda(samp.mean = mean(d1$gh), k=mm.weib.k)
hist(d1$gh, main = "Height of Adult Females: MM", breaks = 100, freq = FALSE,xlim = c(3,12))
curve(dgamma(x, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g), add = TRUE, col = "red")
curve(dnorm(x, mm.norm.mean.g, mm.norm.sd.g), add = TRUE, col = "blue", lwd = 2)
curve(dweibull(x, shape = mm.weib.k.g, scale = mm.weib.lambda.g),add = TRUE, col = "green",lwd = 2)
plot(ecdf(d1$gh),main = "The CDF and ECDF of ht: MM")
curve(pgamma(x, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g), add = TRUE, col = "red",lwd = 6)
curve(pnorm(x, mm.norm.mean.g, mm.norm.sd.g), add = TRUE, col = "blue", lwd = 4)
curve(pweibull(x, shape = mm.weib.k.g, scale = mm.weib.lambda.g),add = TRUE, col = "green",lwd = 4)
p = ppoints(3000)
y.g = quantile(d1$gh, probs = p)
#gamma
x.gamma.mm.g = qgamma(p, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g)
plot(x.gamma.mm.g,y.g,main = "Gamma QQplot")
abline(0,1)
#norm
x.norm.mm.g = qnorm(p, mm.norm.mean.g, mm.norm.sd.g)
plot(x.norm.mm.g,y.g,main = "Norm QQplot")
abline(0,1)
#weibull
x.pwei.mm.g=qweibull(p, shape = mm.weib.k.g, scale = mm.weib.lambda.g)
plot(x.pwei.mm.g,y.g,main = "Weibull QQplot")
abline(0,1)
median(d1$gh)
## [1] 5.5
qweibull(0.5, shape = mm.weib.k.g, scale = mm.weib.lambda.g)
## [1] 5.629781
qgamma(0.5, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g)
## [1] 5.660259
qnorm(0.5, mm.norm.mean.g, mm.norm.sd.g)
## [1] 5.7246
#gamma
med.gam.g = NA
for (i in 1:1000) {
data = rgamma(200, shape = mm.gamma.shape.g, scale = mm.gamma.scale.g)
med.gam.g[i] = median(data)
}
hist(med.gam.g,breaks = 100,main = "Histogram of median of gamma distribution")
#norm
med.norm.g = NA
for (i in 1:1000) {
data = rnorm(200, mm.norm.mean.g, mm.norm.sd.g)
med.norm.g[i] = median(data)
}
hist(med.norm.g,breaks = 100,main = "Histogram of median of normal distribution")
#
med.wei.g = NA
for (i in 1:1000) {
data = rweibull(200, shape = mm.weib.k.g, scale = mm.weib.lambda.g)
med.wei.g[i] = median(data)
}
hist(med.wei.g,breaks = 100,main = "Histogram of median of weibull distribution")
diff(quantile(med.gam.g,probs = c(0.025,0.975)))
## 97.5%
## 0.3623804
diff(quantile(med.norm.g,probs = c(0.025,0.975)))
## 97.5%
## 0.3743458
diff(quantile(med.wei.g,probs = c(0.025,0.975)))
## 97.5%
## 0.2178256
library(stats4)
nLL.g <- function(mean, sd){
fs <- dnorm(
x = d1$gh
,mean = mean
,sd = sd
,log = TRUE
)
-sum(fs)
}
fit.g <- mle(
nLL.g
,start = list(mean = 1, sd = 1)
,method = "L-BFGS-B"
,lower = c(0,0.01)
)
nLL.gamma.g <- function(shape, scale){
fs <- dgamma(
x = d1$gh
,shape = shape
,scale = scale
,log = TRUE
)
-sum(fs)
}
fit.gamma.g <- mle(
nLL.gamma.g
,start = list(shape = 1, scale = 1)
,method = "L-BFGS-B"
,lower = c(0,0.01)
)
nLL.weibull.g <- function(shape, scale){
fs <- dweibull(
x = d1$gh
,shape = shape
,scale = scale
,log = TRUE
)
-sum(fs)
}
fit.weibull.g <- mle(
nLL.weibull.g
,start = list(shape = 1, scale = 1)
,method = "L-BFGS-B"
,lower = c(0,0.01)
)
hist(d1$gh,breaks = 100, freq = FALSE,xlim = c(4,12),main = "Overlay estimated pdf onto histogram of Normal distribution")
curve(dnorm(x,mean = coef(fit.g)[1],sd =coef(fit.g)[2]),col = "red" ,add = TRUE, lwd = 4)
hist(d1$gh,breaks = 100, freq = FALSE,,main = "Overlay estimated pdf onto histogram of Gamma distribution")
curve(dgamma(x,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2]),col = "blue" ,add = TRUE, lwd = 4)
hist(d1$gh,breaks = 100, freq = FALSE,main = "Overlay estimated pdf onto histogram of Weibull distribution")
curve(dweibull(x,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2]),col = "green" ,add = TRUE, lwd = 4)
plot(ecdf(d1$gh),,main = "Overlay estimated CDF onto eCDF of normal distribution")
curve(pnorm(x,mean = coef(fit.g)[1],sd =coef(fit.g)[2]),col = "red" ,add = TRUE, lwd = 4)
plot(ecdf(d1$gh),,main = "Overlay estimated CDF onto eCDF of gamma distribution")
curve(pgamma(x,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2]),col = "blue" ,add = TRUE, lwd = 4)
plot(ecdf(d1$gh),,main = "Overlay estimated CDF onto eCDF of weibull distribution")
curve(pweibull(x,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2]),col = "green" ,add = TRUE, lwd = 4)
qs <- seq(0.05,0.95,length = 5000)
samp_qs.g <- quantile(d1$gh,qs)
theor_qs.g <- qnorm(qs,mean = coef(fit.g)[1],sd =coef(fit.g)[2])
plot(samp_qs.g,theor_qs.g)
abline(0,1)
qs <- seq(0.05,0.95,length = 5000)
samp_qs.gamma.g <- quantile(d1$gh,qs)
theor_qs.gamma.g <- qgamma(qs,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2])
plot(samp_qs.gamma.g,theor_qs.gamma.g)
abline(0,1)
qs <- seq(0.05,0.95,length = 100)
samp_qs.w.g <- quantile(d1$ht,qs)
theor_qs.w.g <- qweibull(qs,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2])
plot(samp_qs.w.g,theor_qs.w.g)
abline(0,1)
qnorm(0.5,mean = coef(fit.g)[1],sd = coef(fit.g)[2])
## [1] 5.7246
qgamma(0.5,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2])
## [1] 5.677983
qweibull(0.5,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2])
## [1] 5.64902
M <- 500
N <- 1000
out.g <- rnorm(N*M,mean = coef(fit.g)[1],sd =coef(fit.g)[2]) %>% array(dim = c(M,N))
samp_dist.g.g <- apply(out.g,1,median)
hist(samp_dist.g.g,breaks = 100,,main = "Median Samp Dist (hist) of normal distribution")
M <- 500
N <- 1000
out.gamma.g <- rgamma(N*M,shape = coef(fit.gamma.g)[1],scale =coef(fit.gamma.g)[2]) %>% array(dim = c(M,N))
samp_dist.gamma.g <- apply(out.gamma.g,1,median)
hist(samp_dist.gamma.g,breaks = 100,main = "Median Samp Dist (hist) of gamma distribution")
M <- 500
N <- 1000
out.w.g <- rweibull(N*M,shape = coef(fit.weibull.g)[1],scale =coef(fit.weibull.g)[2]) %>% array(dim = c(M,N))
samp_dist.w.g <- apply(out.w.g,1,median)
hist(samp_dist.w.g,breaks = 100,main = "Median Samp Dist (hist) of weibull distribution")
diff(quantile(samp_dist.g.g,c(0.05/2,1-0.05/2)))
## 97.5%
## 0.150214
diff(quantile(samp_dist.gamma.g,c(0.05/2,1-0.05/2)))
## 97.5%
## 0.1338429
diff(quantile(samp_dist.w.g,c(0.05/2,1-0.05/2)))
## 97.5%
## 0.2222051
According to Plots, normal and gamma distribution could fit the data in both MM and MLE, the weibull distribution might could not fit the data very well in MM and MLE. But the weibull distribution might fit the data better in MLE than MM.
According to Plots, none of the distribution could fit the data very well.